{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据获取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Otto training CSV; the path is resolved relative to the notebook's working directory.\n",
    "data = pd.read_csv(\"./data/otto/train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0   1       1       0       0       0       0       0       0       0       0   \n",
       "1   2       0       0       0       0       0       0       0       1       0   \n",
       "2   3       0       0       0       0       0       0       0       1       0   \n",
       "3   4       1       0       0       1       6       1       5       0       0   \n",
       "4   5       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "   ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \\\n",
       "0  ...        1        0        0        0        0        0        0   \n",
       "1  ...        0        0        0        0        0        0        0   \n",
       "2  ...        0        0        0        0        0        0        0   \n",
       "3  ...        0        1        2        0        0        0        0   \n",
       "4  ...        1        0        0        0        0        1        0   \n",
       "\n",
       "   feat_92  feat_93   target  \n",
       "0        0        0  Class_1  \n",
       "1        0        0  Class_1  \n",
       "2        0        0  Class_1  \n",
       "3        0        0  Class_1  \n",
       "4        0        0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows to check the column layout (id, feat_* columns, target).\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(61878, 95)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the full training frame.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.00000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>30939.500000</td>\n",
       "      <td>0.38668</td>\n",
       "      <td>0.263066</td>\n",
       "      <td>0.901467</td>\n",
       "      <td>0.779081</td>\n",
       "      <td>0.071043</td>\n",
       "      <td>0.025696</td>\n",
       "      <td>0.193704</td>\n",
       "      <td>0.662433</td>\n",
       "      <td>1.011296</td>\n",
       "      <td>...</td>\n",
       "      <td>0.070752</td>\n",
       "      <td>0.532306</td>\n",
       "      <td>1.128576</td>\n",
       "      <td>0.393549</td>\n",
       "      <td>0.874915</td>\n",
       "      <td>0.457772</td>\n",
       "      <td>0.812421</td>\n",
       "      <td>0.264941</td>\n",
       "      <td>0.380119</td>\n",
       "      <td>0.126135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>17862.784315</td>\n",
       "      <td>1.52533</td>\n",
       "      <td>1.252073</td>\n",
       "      <td>2.934818</td>\n",
       "      <td>2.788005</td>\n",
       "      <td>0.438902</td>\n",
       "      <td>0.215333</td>\n",
       "      <td>1.030102</td>\n",
       "      <td>2.255770</td>\n",
       "      <td>3.474822</td>\n",
       "      <td>...</td>\n",
       "      <td>1.151460</td>\n",
       "      <td>1.900438</td>\n",
       "      <td>2.681554</td>\n",
       "      <td>1.575455</td>\n",
       "      <td>2.115466</td>\n",
       "      <td>1.527385</td>\n",
       "      <td>4.597804</td>\n",
       "      <td>2.045646</td>\n",
       "      <td>0.982385</td>\n",
       "      <td>1.201720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>15470.250000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>30939.500000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>46408.750000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61.00000</td>\n",
       "      <td>51.000000</td>\n",
       "      <td>64.000000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>19.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>55.000000</td>\n",
       "      <td>65.000000</td>\n",
       "      <td>67.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>61.000000</td>\n",
       "      <td>130.000000</td>\n",
       "      <td>52.000000</td>\n",
       "      <td>19.000000</td>\n",
       "      <td>87.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 94 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id       feat_1        feat_2        feat_3        feat_4  \\\n",
       "count  61878.000000  61878.00000  61878.000000  61878.000000  61878.000000   \n",
       "mean   30939.500000      0.38668      0.263066      0.901467      0.779081   \n",
       "std    17862.784315      1.52533      1.252073      2.934818      2.788005   \n",
       "min        1.000000      0.00000      0.000000      0.000000      0.000000   \n",
       "25%    15470.250000      0.00000      0.000000      0.000000      0.000000   \n",
       "50%    30939.500000      0.00000      0.000000      0.000000      0.000000   \n",
       "75%    46408.750000      0.00000      0.000000      0.000000      0.000000   \n",
       "max    61878.000000     61.00000     51.000000     64.000000     70.000000   \n",
       "\n",
       "             feat_5        feat_6        feat_7        feat_8        feat_9  \\\n",
       "count  61878.000000  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean       0.071043      0.025696      0.193704      0.662433      1.011296   \n",
       "std        0.438902      0.215333      1.030102      2.255770      3.474822   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        0.000000      0.000000      0.000000      1.000000      0.000000   \n",
       "max       19.000000     10.000000     38.000000     76.000000     43.000000   \n",
       "\n",
       "       ...       feat_84       feat_85       feat_86       feat_87  \\\n",
       "count  ...  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean   ...      0.070752      0.532306      1.128576      0.393549   \n",
       "std    ...      1.151460      1.900438      2.681554      1.575455   \n",
       "min    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "25%    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "50%    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "75%    ...      0.000000      0.000000      1.000000      0.000000   \n",
       "max    ...     76.000000     55.000000     65.000000     67.000000   \n",
       "\n",
       "            feat_88       feat_89       feat_90       feat_91       feat_92  \\\n",
       "count  61878.000000  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean       0.874915      0.457772      0.812421      0.264941      0.380119   \n",
       "std        2.115466      1.527385      4.597804      2.045646      0.982385   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        1.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "max       30.000000     61.000000    130.000000     52.000000     19.000000   \n",
       "\n",
       "            feat_93  \n",
       "count  61878.000000  \n",
       "mean       0.126135  \n",
       "std        1.201720  \n",
       "min        0.000000  \n",
       "25%        0.000000  \n",
       "50%        0.000000  \n",
       "75%        0.000000  \n",
       "max       87.000000  \n",
       "\n",
       "[8 rows x 94 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for the numeric columns.\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/python/.local/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n",
      "  FutureWarning\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZEAAAEHCAYAAABvHnsJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAakUlEQVR4nO3dfbReZXnn8e+PULDWakAiYhIbRoNTpGohFaaMtUIHgtMa2oIDrZIqbWYqWGsdFWuXsLTM0tYO4yuuVCLQuoxIW4kdFDP4QsdlgIC8oyUiSlKQ1CDYOqDRa/7Y94HH9Jxw2Jznec7xfD9rPevsfe177/vaJyfnOvd+TVUhSVIfe4w7AUnS3GURkST1ZhGRJPVmEZEk9WYRkST1tue4Exi1/fbbr5YtWzbuNCRpTrnmmmv+uaoW7Rqfd0Vk2bJlbN68edxpSNKckuTrk8U9nCVJ6s0iIknqzSIiSerNIiJJ6s0iIknqzSIiSerNIiJJ6m1oRSTJuiT3JLlpl/irk3w5yc1J/mwg/qYkW5J8JcmxA/GVLbYlyRkD8QOTXNniH02y17D2RZI0uWGORM4HVg4GkrwIWAU8t6qeDbyzxQ8GTgKe3dZ5f5IFSRYA7wOOAw4GTm5tAd4BnFNVzwTuBU4d4r5IkiYxtDvWq+qKJMt2Cf8+8PaqerC1uafFVwHrW/xrSbYAz2/LtlTV7QBJ1gOrktwKHAX8VmtzAXAWcO6QdmekvvHWnxt5n09/y40j71PS3DfqcyIHAS9oh6E+n+QXWnwxcOdAu60tNlX8ycC3q2rnLvFJJVmTZHOSzdu3b5+hXZEkjbqI7AnsCxwBvB64KEmG3WlVra2qFVW1YtGif/P8MElST6N+AONW4G+re7H7VUl+COwHbAOWDrRb0mJMEf8WsDDJnm00MthekjQiox6JfBx4EUCSg4C9gH8GNgAnJdk7yYHAcuAq4GpgebsSay+6k+8bWhH6LHBC2+5q4JJR7ogkaYgjkSQfAX4Z2C/JVuBMYB2wrl32+z1gdSsINye5CLgF2AmcVlU/aNs5HbgMWACsq6qbWxdvBNYn+VPgS8B5w9oXSdLkhnl11slTLHrZFO3PBs6eJH4pcOkk8dt5+AouSdIYeMe6JKk3i4gkqTeLiCSpN4uIJKk3i4gkqTeLiCSpN4uIJKk3i4gkqTeLiCSpN4uIJKk3i4gkqTeLiCSpN4uIJKk3i4gkqTeLiCSpN4uIJKk3i4gkqbehFZEk65Lc016Fu+uy1yWpJPu1+SR5d5ItSW5IcuhA29VJbmuf1QPxw5Lc2NZ5d5IMa18kSZMb2utxgfOB9wIXDgaTLAWOAb4xED4OWN4+hwPnAocn2Zfu3ewrgAKuSbKhqu5tbX4PuJLu9bkrgU8OcX+kOensl50wln7f/NcXj6VfjdbQRiJVdQWwY5JF5wBvoCsKE1YBF1ZnE7AwyQHAscDGqtrRCsdGYGVb9sSq2lRVRVeojh/WvkiSJjfScyJJVgHbqur6XRYtBu4cmN/aYruLb50kPlW/a5JsTrJ5+/btj2EPJEmDRlZEkjwe+GPgLaPqc0JVra2qFVW1YtGiRaPuXpJ+bI1yJPIM4EDg+iR3AEuAa5M8FdgGLB1ou6TFdhdfMklckjRCIysiVXVjVT2lqpZV1TK6Q1CHVtXdwAbglHaV1hHAfVV1F3AZcEySfZLsQ3dC/rK27P4kR7Srsk4BLhnVvkiSOsO8xPcjwBeBZyXZmuTU3TS/FLgd2AL8JfAqgKraAbwNuLp93tpitDYfbOt8Fa/MkqSRG9olvlV18iMsXzYwXcBpU7RbB6ybJL4ZOOSxZSlJeiy8Y12S1JtFRJLUm0VEktSbRUSS1JtFRJLUm0VEktSbRUSS1JtFRJLUm0VEktSbRUSS1JtFRJLUm0VEktSbRUSS1JtFRJLUm0VEktSbRUSS1JtFRJLU2zBfj7suyT1JbhqI/XmSLye5IcnfJVk4
sOxNSbYk+UqSYwfiK1tsS5IzBuIHJrmyxT+aZK9h7YskaXLDHImcD6zcJbYROKSqngP8I/AmgCQHAycBz27rvD/JgiQLgPcBxwEHAye3tgDvAM6pqmcC9wK7e4e7JGkIhlZEquoKYMcusU9X1c42uwlY0qZXAeur6sGq+hqwBXh++2ypqtur6nvAemBVkgBHARe39S8Ajh/WvkiSJjfOcyKvBD7ZphcDdw4s29piU8WfDHx7oCBNxCeVZE2SzUk2b9++fYbSlySNpYgkeTOwE/jwKPqrqrVVtaKqVixatGgUXUrSvLDnqDtM8jvArwJHV1W18DZg6UCzJS3GFPFvAQuT7NlGI4PtJUkjMtKRSJKVwBuAl1TVdwcWbQBOSrJ3kgOB5cBVwNXA8nYl1l50J983tOLzWeCEtv5q4JJR7YckqTPMS3w/AnwReFaSrUlOBd4L/DSwMcl1ST4AUFU3AxcBtwCfAk6rqh+0UcbpwGXArcBFrS3AG4E/SrKF7hzJecPaF0nS5IZ2OKuqTp4kPOUv+qo6Gzh7kvilwKWTxG+nu3pLkjQm3rEuSerNIiJJ6s0iIknqzSIiSerNIiJJ6s0iIknqzSIiSerNIiJJ6s0iIknqzSIiSerNIiJJ6s0iIknqzSIiSerNIiJJ6s0iIknqzSIiSeptmG82XJfkniQ3DcT2TbIxyW3t6z4tniTvTrIlyQ1JDh1YZ3Vrf1uS1QPxw5Lc2NZ5d5IMa18kSZMb5kjkfGDlLrEzgMurajlweZsHOI7uverLgTXAudAVHeBM4HC6txieOVF4WpvfG1hv174kSUM2tCJSVVcAO3YJrwIuaNMXAMcPxC+sziZgYZIDgGOBjVW1o6ruBTYCK9uyJ1bVpqoq4MKBbUmSRmTU50T2r6q72vTdwP5tejFw50C7rS22u/jWSeKTSrImyeYkm7dv3/7Y9kCS9JCxnVhvI4gaUV9rq2pFVa1YtGjRKLqUpHlh1EXkm+1QFO3rPS2+DVg60G5Ji+0uvmSSuCRphEZdRDYAE1dYrQYuGYif0q7SOgK4rx32ugw4Jsk+7YT6McBlbdn9SY5oV2WdMrAtSdKI7DmsDSf5CPDLwH5JttJdZfV24KIkpwJfB17aml8KvBjYAnwXeAVAVe1I8jbg6tburVU1cbL+VXRXgP0k8Mn2kSSN0NCKSFWdPMWioydpW8BpU2xnHbBukvhm4JDHkqMk6bHxjnVJUm8WEUlSbxYRSVJv0yoiSS6fTkySNL/s9sR6kscBj6e7wmofYOIhh09kN3eIS5Lmh0e6Ouu/An8IPA24hoeLyP3Ae4eXliRpLthtEamqdwHvSvLqqnrPiHKSJM0R07pPpKrek+QXgWWD61TVhUPKS5I0B0yriCT5K+AZwHXAD1p44hHskqR5arp3rK8ADm53lkuSBEz/PpGbgKcOMxFJ0twz3ZHIfsAtSa4CHpwIVtVLhpKVJGlOmG4ROWuYSUiS5qbpXp31+WEnIkmae6Z7ddZ3ePhVtnsBPwH8a1U9cViJSdKonXXWWfOq35kw3ZHIT09MtzcJrgKOGFZSkqS54VE/xbc6HweOnfl0JElzyXQPZ/3GwOwedPeNPNC30ySvBX6X7hDZjXSvwz0AWA88me45XS+vqu8l2ZvupsbDgG8B/6Wq7mjbeRNwKt0NkH9QVZf1zUmS9OhNdyTyawOfY4Hv0B3SetSSLAb+AFhRVYcAC4CTgHcA51TVM4F76YoD7eu9LX5Oa0eSg9t6zwZWAu9PsqBPTpKkfqZ7TuQVQ+j3J5N8n+5R83cBRwG/1ZZfQHdZ8bl0xeqsFr8YeO/AeZn1VfUg8LUkW4DnA1+c4VwlSVOY7uGsJcB7gCNb6B+A11TV1kfbYVVtS/JO4BvA/wM+TXf46ttVtbM128rD7ytZDNzZ1t2Z5D66Q16LgU0Dmx5cZ9f81wBrAJ7+9Kc/2pQFHPmeIx+50RB84dVfGEu/kqZnuoezPgRsoHuvyNOAT7TY
o9ZebrUKOLBt66foDkcNTVWtraoVVbVi0aJFw+xKkuaV6RaRRVX1oara2T7nA31/G/8K8LWq2l5V3wf+lm6EszDJxMhoCbCtTW8DlgK05U+iO8H+UHySdSRJIzDdIvKtJC9LsqB9Xkb3i7yPbwBHJHl8O7dxNHAL8FnghNZmNXBJm97Q5mnLP9OeJrwBOCnJ3kkOBJYDV/XMSZLUw3SLyCuBlwJ3050EPwH4nT4dVtWVdCfIr6W7vHcPYC3wRuCP2gnyJwPntVXOA57c4n8EnNG2czNwEV0B+hRwWlX9AEnSyEz3AYxvBVZX1b0ASfYF3klXXB61qjoTOHOX8O10V1ft2vYB4MQptnM2cHafHCRJj910RyLPmSggAFW1A/j54aQkSZorpltE9mhXVQEPjUSmO4qRJP2Ymm4h+Avgi0k+1uZPxMNIkjTvTfeO9QuTbKa7qxzgN6rqluGlJUmaC6Z9SKoVDQuHJOkhj/pR8JIkTbCISJJ6s4hIknqziEiSerOISJJ6s4hIknqziEiSerOISJJ6s4hIknqziEiSerOISJJ6s4hIknobSxFJsjDJxUm+nOTWJP8hyb5JNia5rX3dp7VNkncn2ZLkhiSHDmxndWt/W5LVU/coSRqGcY1E3gV8qqr+PfBc4Fa6d6dfXlXLgcvbPMBxwPL2WQOcCw+9GOtM4HC61+qeOfjiLEnS8I28iCR5EvBLwHkAVfW9qvo2sAq4oDW7ADi+Ta8CLqzOJmBhkgOAY4GNVbWjvbp3I7ByZDsiSRrLSORAYDvwoSRfSvLBJD8F7F9Vd7U2dwP7t+nFwJ0D629tsani/0aSNUk2J9m8ffv2GdwVSZrfxlFE9gQOBc6tqp8H/pWHD10BUFUF1Ex1WFVrq2pFVa1YtGjRTG1Wkua9cRSRrcDWqrqyzV9MV1S+2Q5T0b7e05ZvA5YOrL+kxaaKS5JGZORFpKruBu5M8qwWOprutbsbgIkrrFYDl7TpDcAp7SqtI4D72mGvy4BjkuzTTqgf02KSpBGZ9jvWZ9irgQ8n2Qu4HXgFXUG7KMmpwNeBl7a2lwIvBrYA321tqaodSd4GXN3avbWqdoxuFyRJYykiVXUdsGKSRUdP0raA06bYzjpg3YwmJ0maNu9YlyT1ZhGRJPVmEZEk9WYRkST1ZhGRJPVmEZEk9WYRkST1Nq6bDSVJ03DRx54/ln5feuJV02rnSESS1JtFRJLUm0VEktSbRUSS1JtFRJLUm0VEktSbRUSS1JtFRJLU29iKSJIFSb6U5O/b/IFJrkyyJclH21sPSbJ3m9/Sli8b2MabWvwrSY4d065I0rw1zpHIa4BbB+bfAZxTVc8E7gVObfFTgXtb/JzWjiQHAycBzwZWAu9PsmBEuUuSGFMRSbIE+M/AB9t8gKOAi1uTC4Dj2/SqNk9bfnRrvwpYX1UPVtXX6N7BPp7nA0jSPDWukcj/At4A/LDNPxn4dlXtbPNbgcVtejFwJ0Bbfl9r/1B8knUkSSMw8iKS5FeBe6rqmhH2uSbJ5iSbt2/fPqpuJenH3jhGIkcCL0lyB7Ce7jDWu4CFSSaeKrwE2NamtwFLAdryJwHfGoxPss6PqKq1VbWiqlYsWrRoZvdGkuaxkReRqnpTVS2pqmV0J8Y/U1W/DXwWOKE1Ww1c0qY3tHna8s9UVbX4Se3qrQOB5cD0nl0sSZoRs+l9Im8E1if5U+BLwHktfh7wV0m2ADvoCg9VdXOSi4BbgJ3AaVX1g9GnLUnz11iLSFV9Dvhcm76dSa6uqqoHgBOnWP9s4OzhZShJ2h3vWJck9WYRkST1ZhGRJPVmEZEk9WYRkST1ZhGRJPVmEZEk9WYRkST1ZhGRJPU2mx57ImkeufXsz4y8z59981Ej7/PHnSMRSVJvjkQ0Z33+l144ln5feMXnp1z23td9YoSZPOz0v/i1sfQrORKRJPVmEZEk9WYRkST1ZhGRJPVmEZEk9TbyIpJkaZLPJrklyc1JXtPi+ybZ
mOS29nWfFk+SdyfZkuSGJIcObGt1a39bktVT9SlJGo5xjER2Aq+rqoOBI4DTkhwMnAFcXlXLgcvbPMBxwPL2WQOcC13RAc4EDqd7re6ZE4VHkjQaIy8iVXVXVV3bpr8D3AosBlYBF7RmFwDHt+lVwIXV2QQsTHIAcCywsap2VNW9wEZg5ej2RJI01nMiSZYBPw9cCexfVXe1RXcD+7fpxcCdA6ttbbGp4pP1sybJ5iSbt2/fPnM7IEnz3NjuWE/yBOBvgD+sqvuTPLSsqipJzVRfVbUWWAuwYsWKh7Z72OsvnKkuHpVr/vyUsfQrSTNtLCORJD9BV0A+XFV/28LfbIepaF/vafFtwNKB1Ze02FRxSdKIjOPqrADnAbdW1f8cWLQBmLjCajVwyUD8lHaV1hHAfe2w12XAMUn2aSfUj2kxSdKIjONw1pHAy4Ebk1zXYn8MvB24KMmpwNeBl7ZllwIvBrYA3wVeAVBVO5K8Dbi6tXtrVe0YyR5IkoAxFJGq+r9Aplh89CTtCzhtim2tA9bNXHaSpEfDO9YlSb1ZRCRJvVlEJEm9WUQkSb1ZRCRJvVlEJEm9WUQkSb1ZRCRJvVlEJEm9WUQkSb1ZRCRJvVlEJEm9WUQkSb1ZRCRJvVlEJEm9WUQkSb1ZRCRJvc35IpJkZZKvJNmS5Ixx5yNJ88mcLiJJFgDvA44DDgZOTnLweLOSpPljThcR4PnAlqq6vaq+B6wHVo05J0maN1JV486htyQnACur6nfb/MuBw6vq9F3arQHWtNlnAV+Zge73A/55BrYz02ZjXuY0PeY0fbMxrx/3nH6mqhbtGtxzhjY+q1XVWmDtTG4zyeaqWjGT25wJszEvc5oec5q+2ZjXfM1prh/O2gYsHZhf0mKSpBGY60XkamB5kgOT7AWcBGwYc06SNG/M6cNZVbUzyenAZcACYF1V3Tyi7mf08NgMmo15mdP0mNP0zca85mVOc/rEuiRpvOb64SxJ0hhZRCRJvVlEJEm9zcsikuSpSdYn+WqSa5JcmuSgJDcNud8Tk9yc5IdJVuyybFw5/XmSLye5IcnfJVk4C3J6W8vnuiSfTvK0XZaPJa+B/l+XpJLsN+6ckpyVZFv7Xl2X5MXjzqn1/er2c3Vzkj8bd05JPjrwPbojyXWzIKfnJdnUctqc5Pm7LB9XXs9N8sUkNyb5RJIn7naFqppXHyDAF4H/NhB7LvAC4KYh9/2zdHfMfw5YMUtyOgbYs02/A3jHLMjpiQPTfwB8YDZ8r1pfS+muBvw6sN+4cwLOAv77JPFx5vQi4P8Ae7f5p4w7p13y+wvgLePOCfg0cFybfjHwuVny73c18MI2/UrgbbtrPx9HIi8Cvl9VH5gIVNX1wJ0T80mWJfmHJNe2zy+2+AFJrmh/OdyU5AVJFiQ5v83fmOS1U3VcVbdW1WSPXBlnTp+uqp1tdhPdDZvjzun+gdmfAgYvIRxbXs05wBtmWU6TGWdOvw+8vaoebP3eMwtymth+gJcCH5kFORUw8Vf+k4B/Glg2zrwOAq5o0xuB39xN27l9n0hPhwDXPEKbe4D/VFUPJFlO9wO3Avgt4LKqOjvdE4QfDzwPWFxVhwBk4HDQHMzplcBHZ0NOSc4GTgHuo/sPNWFseSVZBWyrquu730Xjz6k5PckpwGbgdVV175hzOgh4Qfs3fIBupHT1mHOa8ALgm1V1W5sfZ05/CFyW5J10pxZ+cWDZOPO6me5Bth8HTuRHnwryb8zHkch0/ATwl0luBD5G95h56IZ5r0hyFvBzVfUd4Hbg3yV5T5KVwP2TbXC255TkzcBO4MOzIaeqenNVLW35nL67tqPIK8njgT8G3vIocxlaTs25wDPofkncRXeoZtw57QnsCxwBvB64KLtU3THkNOFkHh6FTNewcvp94LXt5/y1wHmzJK9XAq9Kcg3w08D3dpvFMI+tzcYPcDRwxSTxZbRjjXTHmSf+OtgT2DnQ7mnA7wHXAae02BPohnwfp7tr
/pFy+Bw/ek5krDkBv0N3/PXxsyWnge08nYFjwOPKC/g5ur/87mifncA3gKfOou/VYH9jywn4FPCigfmvAovG/X1q2/smsGTcP0+t3X08fMN3gPtnQ1679HcQcNXu2szHkchngL3TPR4egCTP4UeHbE8C7qqqHwIvp3ukCkl+hm4o/JfAB4FD012hs0dV/Q3wJ8Chcymn9lfJG4CXVNV3Z0lOywdmVwFfHndeVXVjVT2lqpZV1TJgK3BoVd095u/VAQOzvw5MXLkzzp/zj9MOQSY5CNiL7nHk4/6/9yvAl6tq60BsnDn9E/DCNn0UcNvAsnH+TD2lfd2jtf3AVG2B+TcSGajSF9H9hXQz8L+B5Txc4ZcDNwDX012x9C8tvpruP+mXgH8ADqS7YuJauop/He1qiyn6/XW6Xz4P0v1FdNksyGkL3cm6ibYfmAU5/U1b/wbgE3THcsf+77dLDnfQrs4a8/fqr4Ab27Y3AAfMgpz2Av66beNa4Khx59S2cT4DVzuNOyfgP9Kd97geuBI4bJbk9RrgH9vn7bTR0lQfn50lSeptPh7OkiTNkPl4ie/QJXkfcOQu4XdV1YfGkQ+Y06MxG/Myp+kxp+mbqbw8nCVJ6s3DWZKk3iwikqTeLCLSDEqyMMmrRtDP8UkOfuSW0nBZRKSZtRCYdhFJp8//w+N5+DEX0th4Yl2aQUnW091l/xXgs8BzgH3onnP0J1V1SZJldI+TvxI4jO4x4KcALwO20938eU1VvTPJM4D30T025Lt0j7LYF/h7usdm3Af8ZlV9dVT7KA3yEl9pZp0BHFJVz0uyJ93zyO5vj53YlGRDa7ccWF1Vm5L8At0zjZ5LV2yu5eEnuK6lu8v6tiSHA++vqqPadv6+qi4e5c5Ju7KISMMT4H8k+SXgh8BiYP+27OtVtalNHwlcUlUPAA8k+QRAkifQPR78YwMPwd17VMlL02ERkYbnt+kOQx1WVd9PcgfwuLbsX6ex/h7At6vqecNJT3rsPLEuzazv0L2DAbqnrN7TCsiLgJ+ZYp0vAL+W5HFt9PGr8NAbHr+W5ER46CT8cyfpRxobi4g0g6rqW8AXktxE96KoFe2lQafwo4+0H1znaron8N4AfJLuqbz3tcW/DZya5HoefuMcwHrg9Um+1E6+S2Ph1VnSLJDkCVX1L+0NilcAa6rq2nHnJT0Sz4lIs8PadvPg44ALLCCaKxyJSJJ685yIJKk3i4gkqTeLiCSpN4uIJKk3i4gkqbf/DyEhpDZJlO9zAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Visualize the class distribution of the target column.\n",
    "# `x` is passed by keyword: seaborn 0.11 emits a FutureWarning for positional\n",
    "# data vectors, and from 0.12 on they are an error or are misinterpreted.\n",
    "sns.countplot(x=data.target)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由上图可以看出,该数据类别不均衡,所以需要后期处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据基本处理\n",
    "\n",
    "数据已经经过脱敏,不再需要特殊处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 截取部分数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 95)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the first 10,000 rows, selected by position (no shuffling or sampling).\n",
    "new1_data = data.iloc[:10000]\n",
    "new1_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/python/.local/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n",
      "  FutureWarning\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEHCAYAAABfkmooAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWs0lEQVR4nO3df7BfdZ3f8eeL34s/SIDbLCZoqGZ3B60i3AKrrbNC5VddQ12h+Issm2nWKd1d3Y5baHcai0tHZ21Zf9JJl2hwrIi4LFmXFTNRa7sjSEDkpzQRRZIBcpcE/MGgBt/94/u58jXkei7hnu9Ncp+Pme98z3mfzznnc2e+5MU553POSVUhSdIvs99sd0CStOczLCRJnQwLSVInw0KS1MmwkCR1OmC2O9CHI488shYvXjzb3ZCkvcott9zyD1U1tqtl+2RYLF68mA0bNsx2NyRpr5Lk/qmWeRpKktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnXoNiyTvSnJXkjuTfDrJIUmOSXJTkk1JPpPkoNb24Da/qS1fPLSdi1v93iSn99lnSdLT9RYWSRYCfwiMV9XLgP2B84D3A5dV1UuA7cDytspyYHurX9bakeTYtt5LgTOAjyXZv69+S5Keru87uA8AfiXJT4FDgQeBU4C3tOVrgPcAlwNL2zTANcBHkqTVr6qqHwPfSbIJOBH4Ws99l/ZI37vkn8x2F7QHeuF/vqPX7fd2ZFFVW4APAN9jEBKPAbcAj1bVjtZsM7CwTS8EHmjr7mjtjxiu72IdSdII9Hkaaj6Do4JjgBcAz2FwGqmv/a1IsiHJhomJib52I0lzUp8XuP8F8J2qmqiqnwJ/BbwamJdk8vTXImBLm94CHA3Qlh8GPDJc38U6P1dVq6pqvKrGx8Z2+dBESdJu6jMsvgecnOTQdu3hVOBu4MvAm1qbZcB1bXptm6ct/1JVVauf10ZLHQMsAb7eY78lSTvp7QJ3Vd2U5BrgVmAH8A1gFfC3wFVJ/qzVrmirXAF8sl3A3sZgBBRVdVeSqxkEzQ7gwqp6sq9+S5KertfRUFW1Eli5U/k+BqOZdm77BHDOFNu5FLh0xjsoSZoW7+CWJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ0MC0lSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR16i0skvx6ktuGPt9P8s4khydZl2Rj+57f2ifJh5JsSnJ7kuOHtrWstd+YZNnUe5Uk9aG3sKiqe6vquKo6DjgBeBy4FrgIWF9VS4D1bR7gTGBJ+6wALgdIcjiDV7OexOB1rCsnA0aSNBqjOg11KvDtqrofWAqsafU1wNlteilwZQ3cCMxLchRwOrCuqrZV1XZgHXDGiPotSWJ0YXEe8Ok2vaCqHmzTDwEL2vRC4IGhdTa32lT1X5BkRZINSTZMTEzMZN8lac7rPSySHAS8AfjszsuqqoCaif1U1aqqGq+q8bGxsZnYpCSpGcWRxZnArVX1cJt/uJ1eon1vbfUtwNFD6y1qtanqkqQRGUVYvJmnTkEBrAUmRzQtA64bqp/fRkWdDDzWTlfdAJyWZH67sH1aq0mSRuSAPjee5DnA64DfHyq/D7g6yXLgfuDcVr8eOAvYxGDk1AUAVbUtyXuBm1u7S6pqW5/9liT9ol7Doqp+BByxU+0RBqOjdm5bwIVTbGc1sLqPPkqSunkHtySpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqVOvYZFkXpJrknwryT1JfjPJ4UnWJdnYvue3tknyoSSbktye5Pih7Sxr7TcmWTb1HiVJfej7yOKDwBeq6jeAVwD3ABcB66tqCbC+zQOcCSxpnxXA5QBJDgdWAicBJwIrJwNGkjQa
vYVFksOA1wBXAFTVT6rqUWApsKY1WwOc3aaXAlfWwI3AvCRHAacD66pqW1VtB9YBZ/TVb0nS0/V5ZHEMMAF8PMk3kvxlkucAC6rqwdbmIWBBm14IPDC0/uZWm6r+C5KsSLIhyYaJiYkZ/lMkaW7rMywOAI4HLq+qVwI/4qlTTgBUVQE1EzurqlVVNV5V42NjYzOxSUlS02dYbAY2V9VNbf4aBuHxcDu9RPve2pZvAY4eWn9Rq01VlySNSG9hUVUPAQ8k+fVWOhW4G1gLTI5oWgZc16bXAue3UVEnA4+101U3AKclmd8ubJ/WapKkETmg5+3/AfCpJAcB9wEXMAioq5MsB+4Hzm1trwfOAjYBj7e2VNW2JO8Fbm7tLqmqbT33W5I0pNewqKrbgPFdLDp1F20LuHCK7awGVs9o5yRJ0+Yd3JKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI69RoWSb6b5I4ktyXZ0GqHJ1mXZGP7nt/qSfKhJJuS3J7k+KHtLGvtNyZZNtX+JEn9GMWRxWur6riqmnxj3kXA+qpaAqxv8wBnAkvaZwVwOQzCBVgJnAScCKycDBhJ0mjMxmmopcCaNr0GOHuofmUN3AjMS3IUcDqwrqq2VdV2YB1wxoj7LElzWt9hUcAXk9ySZEWrLaiqB9v0Q8CCNr0QeGBo3c2tNlVdkjQiB/S8/X9WVVuS/CNgXZJvDS+sqkpSM7GjFkYrAF74whfOxCYlSU2vRxZVtaV9bwWuZXDN4eF2eon2vbU13wIcPbT6olabqr7zvlZV1XhVjY+Njc30nyJJc1pvYZHkOUmeNzkNnAbcCawFJkc0LQOua9NrgfPbqKiTgcfa6aobgNOSzG8Xtk9rNUnSiPR5GmoBcG2Syf38r6r6QpKbgauTLAfuB85t7a8HzgI2AY8DFwBU1bYk7wVubu0uqaptPfZbkrST3sKiqu4DXrGL+iPAqbuoF3DhFNtaDaye6T5KkqbHO7glSZ0MC0lSJ8NCktRpWmGRZP10apKkfdMvvcCd5BDgUODINmw1bdHz8S5qSZozukZD/T7wTuAFwC08FRbfBz7SX7ckSXuSXxoWVfVB4INJ/qCqPjyiPkmS9jDTus+iqj6c5FXA4uF1qurKnvolSdqDTCssknwSeDFwG/BkKxdgWEjSHDDdO7jHgWPbXdaSpDlmuvdZ3An8ap8dkSTtuaZ7ZHEkcHeSrwM/nixW1Rt66ZUkaY8y3bB4T5+dkCTt2aY7Gup/990RSdKea7qjoX7AYPQTwEHAgcCPqur5fXVMkrTnmO6RxfMmpzN4m9FS4OS+OiVJ2rM846fO1sBfA6fPfHckSXui6Z6GeuPQ7H4M7rt4Yprr7g9sALZU1euTHANcBRzB4HlTb6+qnyQ5mMFNficAjwD/uqq+27ZxMbCcwQ2Bf1hVvoNbkkZoukcWvz30OR34AYNTUdPxR8A9Q/PvBy6rqpcA2xmEAO17e6tf1tqR5FjgPOClwBnAx1oASZJGZLrXLC7YnY0nWQT8S+BS4I/b9Y5TgLe0JmsYDMu9nEH4vKfVrwE+MnR95Kqq+jHwnSSbgBOBr+1OnyRJz9x0X360KMm1Sba2z+daEHT5C+BPgJ+1+SOAR6tqR5vfzFPvxVgIPADQlj/W2v+8vot1hvu4IsmGJBsmJiam82dJkqZpuqehPg6sZfBeixcAf9NqU0ryemBrVd3yrHo4TVW1qqrGq2p8bGxsFLuUpDljumExVlUfr6od7fMJoOtf5FcDb0jyXQYXtE8BPgjMSzJ5+msRsKVNbwGOBmjLD2Nwofvn9V2sI0kagemGxSNJ3pZk//Z5G4N/yKdUVRdX1aKqWszgAvWXquqtwJeBN7Vmy4Dr2vTaNk9b/qX2lNu1wHlJDm4jqZYAX59mvyVJM2C6YfF7wLnAQ8CDDP4x/93d3Od/
YHCxexODaxJXtPoVwBGt/sfARQBVdRdwNXA38AXgwqp68mlblST1ZroPErwEWFZV2wGSHA58gEGIdKqqrwBfadP3MRjNtHObJ4Bzplj/UgYjqiRJs2C6RxYvnwwKgKraBryyny5JkvY00w2L/ZLMn5xpRxbTPSqRJO3lpvsP/n8Dvpbks23+HDwtJElzxnTv4L4yyQYGw18B3lhVd/fXLUnSnmTap5JaOBgQkjQHPeNHlEuS5h7DQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ0MC0lSJ8NCktTJsJAkdTIsJEmdeguLJIck+XqSbya5K8l/afVjktyUZFOSzyQ5qNUPbvOb2vLFQ9u6uNXvTXJ6X32WJO1an0cWPwZOqapXAMcBZyQ5GXg/cFlVvQTYDixv7ZcD21v9staOJMcC5wEvBc4APpZk/x77LUnaSW9hUQM/bLMHtk8xeCfGNa2+Bji7TS9t87TlpyZJq19VVT+uqu8Am9jFO7wlSf3p9ZpFkv2T3AZsBdYB3wYeraodrclmYGGbXgg8ANCWPwYcMVzfxTrD+1qRZEOSDRMTEz38NZI0d/UaFlX1ZFUdByxicDTwGz3ua1VVjVfV+NjYWF+7kaQ5aSSjoarqUeDLwG8C85JMvqFvEbClTW8BjgZoyw8DHhmu72IdSdII9DkaaizJvDb9K8DrgHsYhMabWrNlwHVtem2bpy3/UlVVq5/XRksdAywBvt5XvyVJTzftd3DvhqOANW3k0n7A1VX1+SR3A1cl+TPgG8AVrf0VwCeTbAK2MRgBRVXdleRqBu//3gFcWFVP9thvSdJOeguLqrodeOUu6vexi9FMVfUEcM4U27oUuHSm+yhJmh7v4JYkdTIsJEmdDAtJUifDQpLUybCQJHUyLCRJnQwLSVInw0KS1MmwkCR1MiwkSZ0MC0lSJ8NCktTJsJAkdTIsJEmdDAtJUifDQpLUqc/Xqh6d5MtJ7k5yV5I/avXDk6xLsrF9z2/1JPlQkk1Jbk9y/NC2lrX2G5Msm2qfkqR+9HlksQP491V1LHAycGGSY4GLgPVVtQRY3+YBzmTwfu0lwArgchiEC7ASOInBG/ZWTgaMJGk0eguLqnqwqm5t0z8A7gEWAkuBNa3ZGuDsNr0UuLIGbgTmJTkKOB1YV1Xbqmo7sA44o69+S5KebiTXLJIsZvA+7puABVX1YFv0ELCgTS8EHhhabXOrTVWXJI1I72GR5LnA54B3VtX3h5dVVQE1Q/tZkWRDkg0TExMzsUlJUnNAnxtPciCDoPhUVf1VKz+c5KiqerCdZtra6luAo4dWX9RqW4Df2qn+lZ33VVWrgFUA4+PjzzqATnj3lc92E9oH3fLn5892F6RZ0edoqABXAPdU1X8fWrQWmBzRtAy4bqh+fhsVdTLwWDtddQNwWpL57cL2aa0mSRqRPo8sXg28HbgjyW2t9h+B9wFXJ1kO3A+c25ZdD5wFbAIeBy4AqKptSd4L3NzaXVJV23rstyRpJ72FRVX9XyBTLD51F+0LuHCKba0GVs9c7yRJz4R3cEuSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjr1+Q7u1Um2JrlzqHZ4knVJNrbv+a2eJB9KsinJ7UmOH1pnWWu/McmyXe1LktSvPo8sPgGcsVPtImB9VS0B1rd5gDOBJe2zArgcBuECrAROAk4EVk4GjCRpdHoLi6r6KrBtp/JSYE2bXgOcPVS/sgZuBOYlOQo4HVhXVduqajuwjqcHkCSpZ6O+ZrGgqh5s0w8BC9r0QuCBoXabW22q+tMkWZFkQ5INExMTM9trSZrjZu0Cd1UVUDO4vVVVNV5V42NjYzO1WUkSow+Lh9vpJdr31lbfAhw91G5Rq01VlySN0KjDYi0wOaJpGXDdUP38NirqZOCxdrrqBuC0JPPb
he3TWk2SNEIH9LXhJJ8Gfgs4MslmBqOa3gdcnWQ5cD9wbmt+PXAWsAl4HLgAoKq2JXkvcHNrd0lV7XzRXJLUs97CoqrePMWiU3fRtoALp9jOamD1DHZNkvQMeQe3JKmTYSFJ6mRYSJI6GRaSpE6GhSSpk2EhSepkWEiSOhkWkqROhoUkqZNhIUnqZFhIkjoZFpKkToaFJKmTYSFJ6mRYSJI6GRaSpE6GhSSp014TFknOSHJvkk1JLprt/kjSXLJXhEWS/YGPAmcCxwJvTnLs7PZKkuaOvSIsgBOBTVV1X1X9BLgKWDrLfZKkOeOA2e7ANC0EHhia3wycNNwgyQpgRZv9YZJ7R9S3ueBI4B9muxN7gnxg2Wx3Qb/I3+aklZmJrbxoqgV7S1h0qqpVwKrZ7se+KMmGqhqf7X5IO/O3OTp7y2moLcDRQ/OLWk2SNAJ7S1jcDCxJckySg4DzgLWz3CdJmjP2itNQVbUjyb8DbgD2B1ZX1V2z3K25xNN72lP52xyRVNVs90GStIfbW05DSZJmkWEhSepkWEiSOhkW+7gkv5rkqiTfTnJLkuuT/FqSO3ve7zlJ7krysySOg9fTzOJv88+TfCvJ7UmuTTKvz/3tKwyLfViSANcCX6mqF1fVCcDFwIIR7P5O4I3AV0ewL+1lZvm3uQ54WVW9HPh/bb/qYFjs214L/LSq/sdkoaq+ydCjU5IsTvJ/ktzaPq9q9aOSfDXJbUnuTPLPk+yf5BNt/o4k75pqx1V1T1X5yBVNZTZ/m1+sqh1t9kYGN/mqw15xn4V228uAWzrabAVeV1VPJFkCfBoYB94C3FBVl7an/h4KHAcsrKqXAXj4rmdhT/lt/h7wmWfe/bnHsNCBwEeSHAc8Cfxaq98MrE5yIPDXVXVbkvuAf5zkw8DfAl+cjQ5rzuj1t5nkPwE7gE/10fl9jaeh9m13ASd0tHkX8DDwCgb/13YQQFV9FXgNg2dwfSLJ+VW1vbX7CvAO4C/76bbmgFn9bSb5XeD1wFvLO5OnxbDYt30JOLg9vh2AJC/nFx/KeBjwYFX9DHg7g8epkORFwMNV9T8Z/Id3fJIjgf2q6nPAnwLHj+bP0D5o1n6bSc4A/gR4Q1U9PrN/1r7Lx33s45K8APgLBv8X9wTwXeCdwLVV9bJ2LvhzQAFfAC6squcmWQa8G/gp8EPgfOD5wMd56n8yLq6qv5tiv/8K+DAwBjwK3FZVp8/8X6i91Sz+NjcBBwOPtNKNVfWOmf779jWGhSSpk6ehJEmdHA2lZyXJR4FX71T+YFV9fDb6I03ytzmzPA0lSerkaShJUifDQpLUybCQdkOSeUn+7Qj2c3aSY/vej9TFsJB2zzxg2mGRgd357+1swLDQrPMCt7QbklwFLAXuBb4MvByYz+B5Rn9aVdclWQzcANzE4MazsxjcQPY2YILBE1ZvqaoPJHkx8FEGNzE+Dvwb4HDg88Bj7fM7VfXtUf2N0jCHzkq75yIG70Q4LskBwKFV9f322Ikbk6xt7ZYAy6rqxiT/FPgdBs8wOhC4laeevLoKeEdVbUxyEvCxqjqlbefzVXXNKP84aWeGhfTsBfivSV4D/AxYyFMv8bm/qm5s068GrquqJ4AnkvwNQJLnAq8CPjt4JxAweByFtMcwLKRn760MTh+dUFU/TfJd4JC27EfTWH8/4NGqOq6f7knPnhe4pd3zA+B5bfowYGsLitcCL5pinb8HfjvJIe1o4vUAVfV94DtJzoGfXwx/xS72I80aw0LaDVX1CPD3Se5k8Ja28SR3MLiA/a0p1rkZWAvcDvwdcAeDC9cwODpZnuSbDN71sLTVrwLeneQb7SK4NCscDSWNUJLnVtUPkxwKfBVYUVW3zna/pC5es5BGa1W7ye4QYI1Bob2FRxaSpE5es5AkdTIsJEmdDAtJUifDQpLUybCQJHX6/8KM/GaqzSsHAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 图形可视化,查看数据分布\n",
    "import seaborn as sns\n",
    "\n",
    "sns.countplot(new1_data.target)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用上面的方式获取数据不可行,因此改用随机欠采样获取相应的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random under-sampling needs the features and the labels as separate objects.\n",
    "# Features: every column except \"id\" and \"target\"; labels: the \"target\" column.\n",
    "\n",
    "x = data.drop(columns=[\"id\", \"target\"])\n",
    "y = data[\"target\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>feat_10</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0       1       0       0       0       0       0       0       0       0   \n",
       "1       0       0       0       0       0       0       0       1       0   \n",
       "2       0       0       0       0       0       0       0       1       0   \n",
       "3       1       0       0       1       6       1       5       0       0   \n",
       "4       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "   feat_10  ...  feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  \\\n",
       "0        0  ...        0        1        0        0        0        0   \n",
       "1        0  ...        0        0        0        0        0        0   \n",
       "2        0  ...        0        0        0        0        0        0   \n",
       "3        1  ...       22        0        1        2        0        0   \n",
       "4        0  ...        0        1        0        0        0        0   \n",
       "\n",
       "   feat_90  feat_91  feat_92  feat_93  \n",
       "0        0        0        0        0  \n",
       "1        0        0        0        0  \n",
       "2        0        0        0        0  \n",
       "3        0        0        0        0  \n",
       "4        1        0        0        0  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Class_1\n",
       "1    Class_1\n",
       "2    Class_1\n",
       "3    Class_1\n",
       "4    Class_1\n",
       "Name: target, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'imblearn'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_4573/3671564806.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# 欠采样获取数据\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mimblearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munder_sampling\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRandomUnderSampler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mrus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRandomUnderSampler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'imblearn'"
     ]
    }
   ],
   "source": [
    "# Random under-sampling of (x, y).\n",
    "# Requires the imbalanced-learn package, which provides the `imblearn` module.\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "\n",
    "# Fixed random_state so the same rows are sampled on every run.\n",
    "rus = RandomUnderSampler(random_state=0)\n",
    "\n",
    "X_resampled, y_resampled = rus.fit_resample(x, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x.shape, y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_resampled.shape, y_resampled.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the class distribution after under-sampling.\n",
    "import seaborn as sns\n",
    "\n",
    "# Pass the labels by keyword: positional data arguments were deprecated in\n",
    "# seaborn 0.11, and from 0.12 the first positional parameter of countplot is\n",
    "# `data`, not `x`.\n",
    "sns.countplot(x=y_resampled)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 把标签值转换为数字"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_resampled.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the class labels as integers.\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "le = LabelEncoder()\n",
    "# NOTE: y_resampled is rebound to the encoded array returned by fit_transform;\n",
    "# le.inverse_transform maps the integers back to the original labels.\n",
    "y_resampled = le.fit_transform(y_resampled)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_resampled"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分割数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Plain (non-stratified) random 80/20 split.\n",
    "# random_state is fixed so the split, and every shape/plot derived from it,\n",
    "# is identical on each Restart & Run All.\n",
    "x_train, x_test, y_train, y_test = train_test_split(\n",
    "    X_resampled,\n",
    "    y_resampled,\n",
    "    test_size=0.2,\n",
    "    random_state=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train.shape, y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test.shape, y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1.数据获取\n",
    "\n",
    "# 2.数据基本处理\n",
    "\n",
    "    # 2.1 截取部分数据\n",
    "    # 2.2 把标签值转换为数字\n",
    "    # 2.3 分割数据(使用StratifiedShuffleSplit)\n",
    "    # 2.4 数据标准化\n",
    "    # 2.5 数据pca降维\n",
    "\n",
    "# 3.模型训练\n",
    "    # 3.1 基本模型训练\n",
    "    # 3.2 模型调优\n",
    "        # 3.2.1 调优参数:\n",
    "            # n_estimators,\n",
    "            # max_depth,\n",
    "            # min_child_weights,\n",
    "            # subsamples,\n",
    "            # colsample_bytrees,\n",
    "            # etas\n",
    "        # 3.2.2 确定最后最优参数\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the class distribution of the test labels from the random split.\n",
    "import seaborn as sns\n",
    "\n",
    "# Keyword `x=` is required: positional data arguments were deprecated in\n",
    "# seaborn 0.11, and from 0.12 the first positional parameter is `data`.\n",
    "sns.countplot(x=y_test)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data with StratifiedShuffleSplit.\n",
    "\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
    "sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)\n",
    "\n",
    "# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays.\n",
    "train_index, test_index = next(sss.split(X_resampled.values, y_resampled))\n",
    "print(len(train_index))\n",
    "print(len(test_index))\n",
    "\n",
    "# Index the feature matrix and the label array with the same index arrays.\n",
    "x_train, x_val = X_resampled.values[train_index], X_resampled.values[test_index]\n",
    "y_train, y_val = y_resampled[train_index], y_resampled[test_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(x_train.shape, x_val.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the class distribution of the stratified validation labels.\n",
    "import seaborn as sns\n",
    "\n",
    "# Keyword `x=` is required: positional data arguments were deprecated in\n",
    "# seaborn 0.11, and from 0.12 the first positional parameter is `data`.\n",
    "sns.countplot(x=y_val)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Learn the scaling statistics from the training split only.\n",
    "scaler = StandardScaler().fit(x_train)\n",
    "\n",
    "# Apply the same fitted scaler to both splits.\n",
    "x_val_scaled = scaler.transform(x_val)\n",
    "x_train_scaled = scaler.transform(x_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据PCA降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_scaled.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# A float n_components between 0 and 1 makes PCA keep as many components as\n",
    "# are needed to explain that fraction of the variance (here 0.9).\n",
    "pca = PCA(n_components=0.9)\n",
    "\n",
    "# Fit the projection on the training split, then reuse it for the validation split.\n",
    "x_train_pca = pca.fit_transform(x_train_scaled)\n",
    "x_val_pca = pca.transform(x_val_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(x_train_pca.shape, x_val_pca.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the cumulative explained-variance ratio of the fitted PCA components.\n",
    "plt.plot(pca.explained_variance_ratio_.cumsum())\n",
    "\n",
    "plt.ylabel(\"表达信息百分占比\")\n",
    "plt.xlabel(\"元素数量\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练\n",
    "## 基本模型训练 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "\n",
    "# Baseline model: XGBClassifier constructed with no arguments, trained on the PCA features.\n",
    "xgb = XGBClassifier()\n",
    "xgb.fit(x_train_pca, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict per-class probabilities rather than hard labels; the log-loss evaluation needs probabilities.\n",
    "y_pre_proba = xgb.predict_proba(x_val_pca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pre_proba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the baseline with log loss on the validation split.\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "# `eps` is intentionally not passed: it was deprecated in scikit-learn 1.3 and\n",
    "# removed in 1.5, where passing it raises a TypeError. On older releases the\n",
    "# default was already 1e-15, so the result is unchanged there.\n",
    "log_loss(y_val, y_pre_proba, normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call the method so the cell displays the parameter dict; without the\n",
    "# parentheses it only echoes the bound-method object.\n",
    "xgb.get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型调优\n",
    "### 确定最优的estimators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scores_ne collects one validation log loss per candidate in n_estimators.\n",
    "scores_ne = []\n",
    "n_estimators = [100, 200, 300, 400, 500, 550, 600, 700]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sweep n_estimators and record the validation log loss of each candidate.\n",
    "# The score list is reset here so that re-running this cell cannot append a\n",
    "# second batch of scores and leave scores_ne longer than n_estimators\n",
    "# (which would break the plot and the argmin lookup that follow).\n",
    "scores_ne = []\n",
    "\n",
    "for nes in n_estimators:\n",
    "    print(\"n_estimators:\", nes)\n",
    "    xgb = XGBClassifier(max_depth=3,\n",
    "                        learning_rate=0.1,\n",
    "                        n_estimators=nes,\n",
    "                        objective=\"multi:softprob\",\n",
    "                        # n_jobs and random_state are the scikit-learn API names.\n",
    "                        # The deprecated aliases nthread=4 (which contradicted\n",
    "                        # n_jobs=-1) and seed=42 are no longer passed.\n",
    "                        n_jobs=-1,\n",
    "                        min_child_weight=1,\n",
    "                        subsample=1,\n",
    "                        colsample_bytree=1,\n",
    "                        random_state=42)\n",
    "\n",
    "    xgb.fit(x_train_pca, y_train)\n",
    "    y_pre = xgb.predict_proba(x_val_pca)\n",
    "    score = log_loss(y_val, y_pre)\n",
    "    scores_ne.append(score)\n",
    "\n",
    "    print(\"每次测试的logloss值是:{}\".format(score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the validation log loss recorded for each n_estimators candidate.\n",
    "plt.plot(n_estimators, scores_ne, \"o-\")\n",
    "plt.ylabel(\"log_loss\")\n",
    "plt.xlabel(\"n_estimators\")\n",
    "plt.show()\n",
    "\n",
    "# The best candidate is the one at the position of the lowest log loss.\n",
    "best_ne_position = np.argmin(scores_ne)\n",
    "print(\"最优的n_estimators值是:{}\".format(n_estimators[best_ne_position]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 确定最优的max_depth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scores_md collects one validation log loss per candidate in max_depths.\n",
    "scores_md = []\n",
    "max_depths = [1,3,5,6,7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sweep max_depth, holding n_estimators at the value with the lowest log loss\n",
    "# found in scores_ne, and record the validation log loss of each candidate.\n",
    "# The score list is reset here so that re-running this cell cannot append a\n",
    "# second batch of scores and leave scores_md longer than max_depths.\n",
    "scores_md = []\n",
    "\n",
    "for md in max_depths:\n",
    "    print(\"max_depth:\", md)\n",
    "    xgb = XGBClassifier(max_depth=md,\n",
    "                        learning_rate=0.1,\n",
    "                        n_estimators=n_estimators[np.argmin(scores_ne)],\n",
    "                        objective=\"multi:softprob\",\n",
    "                        # n_jobs and random_state are the scikit-learn API names.\n",
    "                        # The deprecated aliases nthread=4 (which contradicted\n",
    "                        # n_jobs=-1) and seed=42 are no longer passed.\n",
    "                        n_jobs=-1,\n",
    "                        min_child_weight=1,\n",
    "                        subsample=1,\n",
    "                        colsample_bytree=1,\n",
    "                        random_state=42)\n",
    "\n",
    "    xgb.fit(x_train_pca, y_train)\n",
    "    y_pre = xgb.predict_proba(x_val_pca)\n",
    "    score = log_loss(y_val, y_pre)\n",
    "    scores_md.append(score)\n",
    "\n",
    "    print(\"每次测试的logloss值是:{}\".format(score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the validation log loss recorded for each max_depth candidate.\n",
    "plt.plot(max_depths, scores_md, \"o-\")\n",
    "plt.ylabel(\"log_loss\")\n",
    "plt.xlabel(\"max_depths\")\n",
    "plt.show()\n",
    "\n",
    "# The best candidate is the one at the position of the lowest log loss.\n",
    "best_md_position = np.argmin(scores_md)\n",
    "print(\"最优的max_depths值是:{}\".format(max_depths[best_md_position]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 依据上面的模式,依次调试下面的参数\n",
    "\n",
    "min_child_weights,\n",
    "\n",
    "subsamples,\n",
    "\n",
    "colsample_bytrees,\n",
    "\n",
    "etas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final model with the chosen hyper-parameters, scored by validation log loss.\n",
    "# n_jobs / random_state replace the deprecated aliases nthread / seed with the\n",
    "# same values.\n",
    "xgb = XGBClassifier(learning_rate=0.1,\n",
    "                    n_estimators=550,\n",
    "                    max_depth=3,\n",
    "                    min_child_weight=3,\n",
    "                    subsample=0.7,\n",
    "                    colsample_bytree=0.7,\n",
    "                    n_jobs=4,\n",
    "                    random_state=42,\n",
    "                    objective='multi:softprob')\n",
    "\n",
    "# NOTE(review): this trains on the scaled features, while the tuning loops\n",
    "# above used the PCA-reduced features - confirm this is intended.\n",
    "xgb.fit(x_train_scaled, y_train)\n",
    "\n",
    "y_pre = xgb.predict_proba(x_val_scaled)\n",
    "\n",
    "# `eps` is not passed to log_loss: it was deprecated in scikit-learn 1.3 and\n",
    "# removed in 1.5; on older releases the default was already 1e-15.\n",
    "print(\"测试数据的log_loss值为 : {}\".format(log_loss(y_val, y_pre, normalize=True)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "284px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
